// GLSL-style spellings mapped onto the OpenCL vector types (vec2/vec3/vec4)
// and component swizzles (rgb/rgba).
#define vec2 float2
#define vec3 float3
#define vec4 float4
#define rgb xyz
#define rgba xyzw

// Centre (edgeMax) and half-width (edgeMin) of the smoothstep window used by sm():
// the window runs from edgeMax-edgeMin up to edgeMax+edgeMin.
#define edgeMax 0.888f
#define edgeMin 0.288f

// Shared image sampler: normalized coordinates, clamp-to-edge addressing, linear filtering.
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;

// Samples src_data at coordinate tc after remapping it through the region stored
// in param->origROI: tc is scaled by entries [2],[3] and offset by entries [0],[1].
// NOTE(review): presumably origROI is (x, y, width, height) in normalized units - confirm
// against the FilterParam definition.
vec4 INPUTSRC(image2d_t src_data,__global FilterParam* param, vec2 tc)
{
	const vec2 roiScale = (vec2)(param->origROI[2], param->origROI[3]);
	const vec2 roiOffset = (vec2)(param->origROI[0], param->origROI[1]);
	const vec2 mapped = (vec2)(tc.x, tc.y) * roiScale + roiOffset;
	return read_imagef(src_data, sampler, mapped);
}

// Splits a colour into per-channel ratios and a brightness term.
//   xyz = color / max(color.x, color.y, color.z), capped at 1.0
//   w   = that maximum, capped at 1.0
// cmyk2rgb() multiplies the two parts back together.
//
// When the maximum is not positive (e.g. a pure black pixel) the ratios would be
// 0/0 or x/0, and OpenCL's min() is undefined for NaN/infinite operands. In that
// case the ratios are set to 1.0, which is what the unguarded formula already
// produces for a negative maximum and what it approaches for ever darker greys.
vec4 rgb2cmyk(vec3 color)
{
	float val = max(max(color.x, color.y), color.z);
	if (val <= 0.0f) {
		// No usable hue information: neutral ratios, brightness passed through.
		return (vec4)(1.0f, 1.0f, 1.0f, val);
	}
	return min((vec4)(color.xyz / val, val), 1.0f);
}

// Recombines the output layout of rgb2cmyk(): the three ratio components (xyz)
// are each multiplied by the brightness component (w).
vec3 cmyk2rgb(vec4 color)
{
	const vec3 ratios = color.xyz;
	const float brightness = color.w;
	return ratios * brightness;
}

// Floor-based modulo of each component of x by the scalar y: x - y * floor(x / y).
// y is used as a divisor, so it must not be zero.
vec2 myMod(vec2 x, float y){
	const vec2 wholeSteps = floor(x/y);
	return (x - y * wholeSteps);
}

// Snaps tc onto a grid of spacing S by removing its floor-based remainder,
// i.e. returns S * floor(tc / S) on each axis.
vec2 makeGrid(vec2 tc, float S)
{
	const vec2 offsetInCell = myMod(tc,S);
	return tc - offsetInCell;
}

// Applies smoothstep to every component of v, with the lower edge at
// edgeMax-edgeMin and the upper edge at edgeMax+edgeMin.
vec4 sm(vec4 v)
{
	const vec4 lowerEdge = (vec4)(edgeMax-edgeMin);
	const vec4 upperEdge = (vec4)(edgeMax+edgeMin);
	return smoothstep(lowerEdge, upperEdge, v);
}

// Evaluates one halftone screen at the centred pixel position fc.
//   m          - 2x2 matrix packed in a vec4; tone() passes the output of rt()
//   resolution - image size in pixels, used to turn the dot centre into a
//                normalized sampling coordinate
//   S          - grid cell size (used as a divisor, so it must be non-zero)
// Returns the rgb2cmyk() decomposition of the colour sampled at the centre of the
// grid cell containing fc, with the normalized distance from fc to that centre
// added to every component.
vec4 tone2(image2d_t src_data,__global FilterParam* param, vec2 fc, vec4 m, vec2 resolution, float S)
{
	// Transform the pixel into the screen's own frame.
	const vec2 inScreen = (vec2)(m.x*fc.x+m.z*fc.y,m.y*fc.x+m.w*fc.y);

	// Centre of the grid cell the transformed point falls into.
	const vec2 cellCentre = (makeGrid(inScreen, S) + 0.5f*S);

	// Transform the cell centre back into the image frame.
	const vec2 dotCentre = (vec2)(cellCentre.x * m.x + cellCentre.y * m.y, cellCentre.x * m.z + cellCentre.y * m.w);

	// Distance to the dot centre, scaled by the cell size and capped at 1.
	const float dist = min(length(fc-dotCentre) / (1.48f*0.5f*S), 1.0f);

	// Sample the source at the dot centre and undo the display gamma.
	const vec2 uv = (dotCentre + 0.5f * resolution)/resolution.xy;
	const vec3 linearColor = pow(INPUTSRC(src_data, param, uv).xyz, (vec3)(2.2f));

	return rgb2cmyk(linearColor) + dist;
}

// Packs the coefficients of a 2x2 rotation by angle r (radians) into a vec4,
// laid out as (cos r, -sin r, sin r, cos r).
vec4 rt(float r)
{
	const float cosine = cos(r);
	const float sine = sin(r);
	return (vec4)(cosine, -sine, sine, cosine);
}

// Builds the halftone colour for one pixel.
// Four screens are evaluated with tone2(), each rotated by a common angle
// (speed * 0.333 * iGlobalTime) plus a fixed offset of 15, 75, 0 and 45 degrees.
// Screen k contributes component k (x, y, z, w) of its tone2() result. The combined
// vec4 goes through sm() and cmyk2rgb(), then is gamma encoded.
// Returns the colour with w set to 1.
vec4 tone(image2d_t src_data, __global FilterParam* param, vec2 fragCoord, vec2 resolution, float iGlobalTime,float speed, float size)
{
	const float theta = speed*0.333f*iGlobalTime;

	// Pixel position relative to the image centre.
	const vec2 fc = fragCoord.xy - 0.5f * resolution.xy;

	// One screen per component, each at its own angle.
	const float screenX = tone2(src_data, param, fc, rt(theta + radians(15.0f)), resolution, size).x;
	const float screenY = tone2(src_data, param, fc, rt(theta + radians(75.0f)), resolution, size).y;
	const float screenZ = tone2(src_data, param, fc, rt(theta), resolution, size).z;
	const float screenW = tone2(src_data, param, fc, rt(theta + radians(45.0f)), resolution, size).w;

	vec3 c = cmyk2rgb(sm((vec4)(screenX, screenY, screenZ, screenW)));

	c = pow(c, (vec3)(1.0f/2.2f)); // Gamma encode.
	return (vec4)(c, 1.0f);
}

// Gaussian probability density with zero mean and standard deviation sigma,
// evaluated at x: 0.39894 * exp(-x^2 / (2 sigma^2)) / sigma.
// sigma is used as a divisor, so it must be non-zero.
float normpdf(float x, float sigma) {
	const float exponent = -0.5f * x * x/ (sigma * sigma);
	return 0.39894f * exp(exponent) / sigma;
}

// Same Gaussian expression as normpdf(), with the squared length of v in place
// of x*x: 0.39894 * exp(-dot(v,v) / (2 sigma^2)) / sigma.
// sigma is used as a divisor, so it must be non-zero.
float normpdf3(vec3 v, float sigma) {
	const float exponent = -0.5f * dot(v,v) / (sigma * sigma);
	return 0.39894f * exp(exponent) / sigma;
}

// Linearly maps value from the range [min, max] onto [0, 1]:
// returns (value - min) / (max - min). No clamping is applied, and
// max must differ from min because their difference is the divisor.
float normalizeColorChannel(float value, float min, float max) {
    const float offset = value - min;
    const float span = max - min;
    return offset / span;
}

// Bilateral filter over a 15x15 window (offsets -7..7 on both axes) centred on fragCoord.
//   fragCoord  - position in pixels; divided by resolution to get the sampling coordinate
//   resolution - image size in pixels
// Every tap is weighted by the product of
//   - a range term: normpdf3 of the colour difference to the centre tap (sigma 0.1),
//     scaled by nf so that an identical colour weighs 1, and
//   - a spatial term: normpdf of the x offset times normpdf of the y offset (sigma 10).
// Returns the weighted average colour with w set to 1.
vec4 bilateral(image2d_t src_data,__global FilterParam* param, vec2 fragCoord, vec2 resolution) {
	const int radius = 7;
	const float sigmaRange = 0.1f;
	const float sigmaSpatial = 10.0f;

	const vec3 centre = INPUTSRC(src_data, param, fragCoord.xy / resolution.xy).xyz;

	// Normalizes the range term so a zero colour difference has weight 1.
	const float nf = 1.0f / normpdf(0.0f, sigmaRange);

	vec3 accum = (vec3)(0.0f);
	float weightSum = 0.0f;

	for (int i = -radius; i <= radius; ++i) {
		// Depends only on i, so it is evaluated once per column instead of once per tap.
		const float weightX = normpdf((float)(i), sigmaSpatial);

		for (int j = -radius; j <= radius; ++j) {
			const vec2 tap = (fragCoord.xy + (vec2)((float)(i),(float)(j))) / resolution.xy;
			const vec3 color = INPUTSRC(src_data, param, tap).xyz;

			// Multiplication order kept as range * nf * spatial(j) * spatial(i).
			const float factor = normpdf3(color - centre, sigmaRange) * nf * normpdf((float)(j), sigmaSpatial) * weightX;
			weightSum += factor;
			accum += factor * color;
		}
	}

	// The centre tap always contributes a positive weight, so weightSum is non-zero.
	return (vec4)(accum / weightSum, 1.0f);
}

// Applies a fixed linear transform to the three components of color:
//   out.x = 0.25*x - 0.25*y + 0.5*z
//   out.y = 0.5*x  + 0.5*y
//   out.z = 0.25*x - 0.25*y - 0.5*z
vec3 YCoCr3(vec3 color) {
	const float c0 = color.x;
	const float c1 = color.y;
	const float c2 = color.z;

	vec3 result;
	result.x = c0*0.25f+c1*(-0.25f)+c2*0.5f;
	result.y = c0*0.5f+c1*0.5f;
	result.z = c0*0.25f+c1*(-0.25f)+c2*(-0.5f);
	return result;
}

// Samples the source image at uv and returns YCoCr3() of the sample with its
// first three components reversed (.zyx).
vec3 YCoCr(image2d_t src_data, __global FilterParam* param, vec2 uv) {
	const vec4 texel = INPUTSRC(src_data, param, uv);
	return YCoCr3(texel.zyx);
}

// Sobel gradient magnitude of a 3x3 neighbourhood given as three vec3 slices
// I0, I1, I2. gx uses the weights (1,0,-1), (2,0,-2), (1,0,-1) on the three slices;
// gy uses (1,2,1), (0,0,0), (-1,-2,-1). Returns sqrt(gx^2 + gy^2).
float calcSobel(vec3 I0, vec3 I1, vec3 I2) {
	// Difference weights, applied within each slice.
	const vec3 diffOuter = (vec3)(1.0f, 0.0f, -1.0f);
	const vec3 diffMiddle = (vec3)(2.0f, 0.0f, -2.0f);

	// Smoothing weights, applied across the first and last slice with opposite sign.
	const vec3 smoothFirst = (vec3)(1.0f, 2.0f, 1.0f);
	const vec3 smoothNone = (vec3)(0.0f, 0.0f, 0.0f);
	const vec3 smoothLast = (vec3)(-1.0f, -2.0f, -1.0f);

	const float gx = dot(diffOuter, I0) + dot(diffMiddle, I1) + dot(diffOuter, I2);
	const float gy = dot(smoothFirst, I0) + dot(smoothNone, I1) + dot(smoothLast, I2);

	return sqrt(gx * gx + gy * gy);
}

// Runs calcSobel() separately on each of the three YCoCr() components around uv.
//   uv               - normalized centre coordinate
//   resolution       - image size in pixels; one tap step is scale / resolution
//   scale_x, scale_y - tap spacing multipliers per axis
// Returns (edge strength of component x, of component y, of component z).
vec3 sobel(image2d_t src_data, __global FilterParam* param, vec2 uv, vec2 resolution, float scale_x, float scale_y) {
	// taps[i*3+j] holds the sample at x offset (i-1) and y offset (j-1).
	vec3 taps[9];

	for (int k = 0; k < 9; k++) {
		const int i = k / 3;
		const int j = k % 3;
		const vec2 pos = uv + (vec2)((float)(i-1) / resolution.x * scale_x, (float)(j-1) / resolution.y * scale_y);
		taps[k] = YCoCr(src_data, param, pos);
	}

	// Slice n gathers the taps with j == n, ordered by i.
	const float edgeX = calcSobel(
		(vec3)(taps[0].x, taps[3].x, taps[6].x),
		(vec3)(taps[1].x, taps[4].x, taps[7].x),
		(vec3)(taps[2].x, taps[5].x, taps[8].x));

	const float edgeY = calcSobel(
		(vec3)(taps[0].y, taps[3].y, taps[6].y),
		(vec3)(taps[1].y, taps[4].y, taps[7].y),
		(vec3)(taps[2].y, taps[5].y, taps[8].y));

	const float edgeZ = calcSobel(
		(vec3)(taps[0].z, taps[3].z, taps[6].z),
		(vec3)(taps[1].z, taps[4].z, taps[7].z),
		(vec3)(taps[2].z, taps[5].z, taps[8].z));

	return (vec3)(edgeX, edgeY, edgeZ);
}

// Kernel entry point: writes one output pixel per work-item.
// The result is the average of the halftone colour (tone) and the bilateral-filtered
// colour, darkened by the Sobel edge strength, then blended with the original pixel.
//   src_data  - input image
//   dest_data - output image, written at (get_global_id(0), get_global_id(1))
//   param     - filter parameters; cur_time drives the halftone rotation
//   size      - halftone cell size, scaled by image width / 640
//   speed     - rotation speed, multiplied by 0.1
//   alpha     - blend amount in percent (0 = original only, 100 = effect only)
__kernel void MAIN(
      __read_only image2d_t src_data,  
      __write_only image2d_t dest_data,
      __global FilterParam* param,
	  int size,
	  int speed,
	  int alpha) 
{
	const int W = get_global_size(0);
	const int H = get_global_size(1);
	const vec2 resolution = (vec2)(W,H);

	const int2 gl_FragCoord = (int2)(get_global_id(0), get_global_id(1));
	// NOTE(review): get_global_id0/1 are helpers defined outside this file; presumably
	// they return this work-item's pixel position adjusted by param - confirm.
	const vec2 fragCoord = (vec2)(get_global_id0(param), get_global_id1(param));
	// Normalized coordinate of the pixel centre.
	const vec2 tc = ((vec2)(fragCoord.x, fragCoord.y) + (vec2)(0.5f))/resolution.xy;

	const vec4 orig = INPUTSRC(src_data, param, tc);

	const float iGlobalTime = param->cur_time;
	const vec4 blColor = bilateral(src_data, param, fragCoord, resolution);

	const float speedf = (float)(speed) * 0.1f;
	const float scale_x = resolution.x / 640.0f;
	const float scale_y = resolution.y / 433.0f;

	float sizef = (float)(size) * scale_x;
	if (sizef <= 0.0f) {
		// The cell size is a divisor in the halftone grid math; a non-positive value
		// would yield NaN/inf, so fall back to one-pixel cells.
		sizef = 1.0f;
	}

	const vec4 toneColor = tone(src_data, param, fragCoord, resolution, iGlobalTime, speedf, sizef);
	const vec3 sobe_edge = sobel(src_data, param, tc, resolution, scale_x, scale_y);

	// Mean edge strength of the three components; weak edges are suppressed entirely.
	float edge = clamp((sobe_edge.x + sobe_edge.y + sobe_edge.z)/3.0f, 0.0f, 1.0f);
	if (edge < 0.05f) {
		edge = 0.0f;
	}

	vec4 fragColor;
	fragColor.xyz = (toneColor.xyz + blColor.xyz) / 2.0f;
	fragColor.xyz -= (vec3)(edge);
	fragColor.xyz = clamp(fragColor.xyz, 0.0f, 1.0f);
	fragColor.w = orig.w;

	// mix() is only defined for a weight within [0, 1], so out-of-range alpha is clamped.
	const float blend = clamp((float)(alpha)/100.0f, 0.0f, 1.0f);
	write_imagef(dest_data,gl_FragCoord, mix(orig, fragColor, blend));
}
